1 About

This book sets out the analysis of somite development periods, for Ali Ahmed Seleit in the Aulehla Group at EMBL Heidelberg.

Video by Ali Ahmed Seleit

(#fig:unnamed-chunk-1)Video by Ali Ahmed Seleit

2 Alignment of F0 and F2 generations

All the code used to process the data using Snakemake1 can be found here:

3 Homozygosity of F0 Cab and Kaga

Snakefile for aligning F0 samples: https://github.com/brettellebi/somites/blob/master/workflow/rules/02_F0_mapping.smk

Snakefile for calling F0 samples: https://github.com/brettellebi/somites/blob/master/workflow/rules/03_F0_calling.smk

3.1 Read in total medaka genome count

# Get chromosome lengths
med_chr_lens = read.table(here::here("data",
                                     "Oryzias_latipes.ASM223467v1.dna.toplevel.fa_chr_counts.txt"),
                          col.names = c("chr", "end"))
# Add start
med_chr_lens$start = 1
# Reorder
med_chr_lens = med_chr_lens %>% 
  dplyr::select(chr, start, end) %>% 
  # remove MT
  dplyr::filter(chr != "MT")

# Total HdrR sequence length
total_hdrr_bases = sum(med_chr_lens$end)

Make custom chromosome scaffold

##Create custom genome 
med_genome = regioneR::toGRanges(med_chr_lens)

3.2 Read in data

in_dir = "/nfs/research/birney/users/ian/somites/recombination_blocks"

in_files = list.files(in_dir, pattern = "20210803_hmm_output_F0", full.names = T)

# Read into list
ck_list = purrr::map(in_files, function(FILE){
  out = readr::read_tsv(FILE,
                        col_types = "ciiidii")
})
# Set names as bin length
names(ck_list) = basename(in_files) %>% 
  stringr::str_split("_", simplify = T) %>% 
  subset(select = 6) %>% 
  stringr::str_remove(".txt")
# Reorder
ck_list = ck_list[order(as.numeric(names(ck_list)))]

counter = 0
ck_list = purrr::map(ck_list, function(data){
  counter <<- counter + 1
  # set bin length
  bin_length = as.numeric(names(ck_list)[counter])
  # add bin start and end coordinates
  df = data %>% 
    dplyr::mutate(LANE = sample %>%
                    stringr::str_split("/", simplify = T) %>% 
                    subset(select = 10),
                  BIN_LENGTH = bin_length,
                  BIN_START = (bin - 1) * bin_length + 1,
                  BIN_END = bin * bin_length,
                  BIN_LENGTH_KB = BIN_LENGTH / 1e3,
                  READS_PER_BIN = mat + pat)
  return(df)
})

# Recode `ck_list$state` so that 0,1,2 corresponds to HOM_REF, HET, HOM_ALT 
ck_list = purrr::map(ck_list, function(df){
  df = df %>% 
    dplyr::mutate(state = dplyr::recode(state,
                                        `0` = 2,
                                        `1` = 1,
                                        `2` = 0))
})

3.2.1 Get total number of bases covered by each state

# Take 5kb DF
df = ck_list$`5000`

# Set states to loop over
states = 0:2
names(states) = states

# Run loop over each LANE

base_cov_df = df %>% 
  split(., f = .$LANE) %>% 
  purrr::map(., function(LANE){
    # convert to ranges object
    lane_ranges = GenomicRanges::makeGRangesFromDataFrame(LANE,
                                                          keep.extra.columns = T,
                                                          ignore.strand = T,
                                                          seqnames.field = "chr", 
                                                          start.field = "BIN_START",
                                                          end.field = "BIN_END")
    # get total bases covered by each state
    purrr::map_dfr(states, function(STATE){
      lane_ranges[lane_ranges$state == STATE] %>% 
        # merge contiguous ranges
        GenomicRanges::reduce(.) %>% 
        # get width of ranges
        width(.) %>% 
        # get total bases covered
        sum(.) %>% 
        # coerce into data frame
        data.frame("BASES_COVERED" = .)
      }, .id = "STATE") %>% 
        # add FREQ column
        dplyr::mutate(FREQ = BASES_COVERED / total_hdrr_bases) %>% 
        # add UNCLASSIFIED row
        tibble::add_row(STATE = "UNCLASSIFIED", 
                        BASES_COVERED = total_hdrr_bases - sum(.$BASES_COVERED),
                        FREQ = (total_hdrr_bases - sum(.$BASES_COVERED)) / total_hdrr_bases)
    }
  ) %>% 
  dplyr::bind_rows(.id = "LANE")

Plot

# Plot
ck_prop_plot = base_cov_df %>% 
  dplyr::mutate(STATE = factor(STATE, levels = c(0,1,2, "UNCLASSIFIED")),
                #STATE_RECODE = dplyr::recode(STATE,
                #                             `0` = "HOM REF (HdrR)",
                #                             `1` = "HET",
                #                             `2` = "HOM ALT",
                #                             "UNCLASSIFIED" = "Unclassified")
                ) %>% 
  # plot
  ggplot(aes(STATE, FREQ, colour = STATE, fill = STATE)) +
    geom_col() +
    facet_grid(cols = vars(LANE)) +
    theme_bw() +
    scale_colour_manual(values = pal_hom_het_2_lines) +
    scale_fill_manual(values = pal_hom_het_2) +
    guides(colour = "none", fill = "none") +
    xlab("Genotype") +
    ylab("Proportion of reference bases covered")

ck_prop_plot


# Interactive version
ggplotly(ck_prop_plot)

3.3 Karyoplot

bb_list_ck = purrr::map(ck_list, function(df){
  # loop over different bin lengths
  block_bounds_list = df %>% 
    # loop over LANE
    split(., f = .$LANE) %>% 
    purrr::map(., function(LANE){
      # loop over CHR
      LANE %>% 
        split(., f = .$chr) %>% 
        purrr::map(., function(CHR){
          # Get lengths of each contiguous state
          cont_len = rle(CHR$state)
          
          # Get cumulative sum of those lengths
          cum_blocks = cumsum(cont_len$lengths)
          
          # Get rows that correspond to block changes
          block_bounds = CHR[cum_blocks, ] %>% 
            # Add end of previous black
            dplyr::mutate(END_PREV = dplyr::lag(BIN_END)) %>% 
            # Replace the NA in the first row with `1`
            dplyr::mutate(END_PREV = tidyr::replace_na(END_PREV, 1)) %>% 
            # Add colour
            dplyr::mutate(COLOUR = dplyr::recode(state,
                                                 !!!pal_hom_het_2[-which(names(pal_hom_het_2) == "UNCLASSIFIED")])) 
          
        }) %>% 
            dplyr::bind_rows()
      
  })
})

Extract y cutoff points for each y

lc_list_ck = purrr::map(bb_list_ck, function(block_bounds_list){
  lane_cutoffs = cut(0:1, breaks = length(block_bounds_list)) %>% 
    levels(.) %>% 
    data.frame(lower = as.numeric( sub("\\((.+),.*", "\\1", .) ),
               upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", .) )) %>% 
    dplyr::arrange(dplyr::desc(lower))
  return(lane_cutoffs)
})

Plot Karyoplots

counter_A = 0
purrr::map(bb_list_ck, function(block_bounds_list){
  counter_A <<- counter_A + 1
  # set file name
  file_name = paste("20210803_ck_karyoplot_", names(bb_list_ck)[counter_A], ".png", sep = "")
  file_out = here::here("docs/plots", file_name)
  
  # Get lane cutoffs
  lane_cutoffs = lc_list_ck[[counter_A]]
  
  png(file=file_out,
      width=13000,
      height=3000,
      units = "px",
      res = 300)
  
  # Plot ideogram
  kp = karyoploteR::plotKaryotype(med_genome, plot.type = 5)
  # Add data background
  #karyoploteR::kpDataBackground(kp, r0=0, r1 = 1, color = "white")
  
  # Add rectangles in loop
  counter_B = 0
  purrr::map(block_bounds_list, function(LANE){
    # Add to counter_B
    counter_B <<- counter_B + 1
    # Add rectangles
    karyoploteR::kpRect(kp,
                        chr = LANE$chr,
                        x0 = LANE$END_PREV,
                        x1 = LANE$BIN_END,
                        y0 = lane_cutoffs[counter_B, ] %>% 
                          dplyr::pull(lower),
                        y1 = lane_cutoffs[counter_B, ] %>% 
                          dplyr::pull(upper),
                        col = LANE$COLOUR,
                        border = NA)
    # Add axis label
    karyoploteR::kpAddLabels(kp, labels = unique(LANE$LANE),
                             r0 = lane_cutoffs[counter_B, ] %>% 
                               dplyr::pull(lower),
                             r1 = lane_cutoffs[counter_B, ] %>% 
                               dplyr::pull(upper),
                             cex = 0.5)
  })
  
  
  dev.off()  
})
knitr::include_graphics(here::here("book/plots/20210803_ck_karyoplot_5000.png"))
Bin length: 5 kb

(#fig:unnamed-chunk-10)Bin length: 5 kb

3.4 20211110 update

Changes:

  • Used homozgygous-divergent Cab-Kaga sites instead of all sites in F0 VCF
  • Filtered out reads that overlapped repeat regions

3.4.1 Read in data

in_dir = "/nfs/research/birney/users/ian/somites/recombination_blocks/F0/no_repeat_reads"

in_files = list.files(in_dir, full.names = T)

# Read into list
ck_list = purrr::map(in_files, function(FILE){
  out = readr::read_tsv(FILE,
                        col_types = "ciiidii")
})
# Set names as bin length
names(ck_list) = basename(in_files) %>%
  stringr::str_remove(".txt")
# Reorder
ck_list = ck_list[order(as.numeric(names(ck_list)))]

counter = 0
ck_list = purrr::map(ck_list, function(data){
  counter <<- counter + 1
  # set bin length
  bin_length = as.numeric(names(ck_list)[counter])
  # add bin start and end coordinates
  df = data %>% 
    dplyr::mutate(LANE = sample %>%
                    basename() %>% 
                    stringr::str_remove(".txt"),
                  BIN_LENGTH = bin_length,
                  BIN_START = (bin - 1) * bin_length + 1,
                  BIN_END = bin * bin_length,
                  BIN_LENGTH_KB = BIN_LENGTH / 1e3,
                  READS_PER_BIN = mat + pat)
  return(df)
})

# Recode `ck_list$state` so that 0,1,2 corresponds to Cab, Het, Kaga 
ck_list = purrr::map(ck_list, function(df){
  df = df %>% 
    dplyr::mutate(state = dplyr::recode(state,
                                        `0` = 2,
                                        `1` = 1,
                                        `2` = 0))
})

3.4.2 Get total number of bases covered by each state

# Take 5kb DF
df = ck_list$`5000`

# Set states to loop over
states = 0:2 ; names(states) = states

# Run loop over each LANE

base_cov_df = df %>% 
  split(., f = .$LANE) %>% 
  purrr::map(., function(LANE){
    # convert to ranges object
    lane_ranges = GenomicRanges::makeGRangesFromDataFrame(LANE,
                                                          keep.extra.columns = T,
                                                          ignore.strand = T,
                                                          seqnames.field = "chr", 
                                                          start.field = "BIN_START",
                                                          end.field = "BIN_END")
    # get total bases covered by each state
    purrr::map_dfr(states, function(STATE){
      lane_ranges[lane_ranges$state == STATE] %>% 
        # merge contiguous ranges
        GenomicRanges::reduce(.) %>% 
        # get width of ranges
        width(.) %>% 
        # get total bases covered
        sum(.) %>% 
        # coerce into data frame
        data.frame("BASES_COVERED" = .)
      }, .id = "STATE") %>% 
        # add FREQ column
        dplyr::mutate(FREQ = BASES_COVERED / total_hdrr_bases) %>% 
        # add UNCLASSIFIED row
        tibble::add_row(STATE = "UNCLASSIFIED", 
                        BASES_COVERED = total_hdrr_bases - sum(.$BASES_COVERED),
                        FREQ = (total_hdrr_bases - sum(.$BASES_COVERED)) / total_hdrr_bases)
    }
  ) %>% 
  dplyr::bind_rows(.id = "LANE")

Plot

# Plot
ck_prop_plot = base_cov_df %>% 
  dplyr::mutate(STATE = factor(STATE, levels = c(0,1,2, "UNCLASSIFIED"))) %>% 
  # plot
  ggplot(aes(STATE, FREQ, colour = STATE, fill = STATE)) +
    geom_col() +
    facet_grid(cols = vars(LANE)) +
    theme_bw(base_size = 9) +
    scale_colour_manual(values = pal_hom_het_2_lines) +
    scale_fill_manual(values = pal_hom_het_2) +
    guides(colour = "none", fill = "none") +
    xlab("Genotype") +
    ylab("Proportion of reference bases covered")

ck_prop_plot


# Interactive version
ggplotly(ck_prop_plot)

3.4.3 Karyoplot

bb_list_ck = purrr::map(ck_list, function(df){
  # loop over different bin lengths
  block_bounds_list = df %>% 
    # loop over LANE
    split(., f = .$LANE) %>% 
    purrr::map(., function(LANE){
      # loop over CHR
      LANE %>% 
        split(., f = .$chr) %>% 
        purrr::map(., function(CHR){
          # Get lengths of each contiguous state
          cont_len = rle(CHR$state)
          
          # Get cumulative sum of those lengths
          cum_blocks = cumsum(cont_len$lengths)
          
          # Get rows that correspond to block changes
          block_bounds = CHR[cum_blocks, ] %>% 
            # Add end of previous black
            dplyr::mutate(END_PREV = dplyr::lag(BIN_END)) %>% 
            # Replace the NA in the first row with `1`
            dplyr::mutate(END_PREV = tidyr::replace_na(END_PREV, 1)) %>% 
            # Add colour
            dplyr::mutate(COLOUR = dplyr::recode(state,
                                                 !!!pal_hom_het_2[-which(names(pal_hom_het_2) == "UNCLASSIFIED")])) 
          
        }) %>% 
            dplyr::bind_rows()
      
  })
})

Extract y cutoff points for each y

lc_list_ck = purrr::map(bb_list_ck, function(block_bounds_list){
  lane_cutoffs = cut(0:1, breaks = length(block_bounds_list)) %>% 
    levels(.) %>% 
    data.frame(lower = as.numeric( sub("\\((.+),.*", "\\1", .) ),
               upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", .) )) %>% 
    dplyr::arrange(dplyr::desc(lower))
  return(lane_cutoffs)
})

Plot Karyoplots

counter_A = 0
purrr::map(bb_list_ck, function(block_bounds_list){
  counter_A <<- counter_A + 1
  # set file name
  file_name = paste("20211110_ck_karyoplot_", names(bb_list_ck)[counter_A], ".png", sep = "")
  file_out = here::here("book/plots", file_name)
  
  # Get lane cutoffs
  lane_cutoffs = lc_list_ck[[counter_A]]
  
  png(file=file_out,
      width=13000,
      height=3000,
      units = "px",
      res = 300)
  
  # Plot ideogram
  kp = karyoploteR::plotKaryotype(med_genome, plot.type = 5)
  # Add data background
  #karyoploteR::kpDataBackground(kp, r0=0, r1 = 1, color = "white")
  
  # Add rectangles in loop
  counter_B = 0
  purrr::map(block_bounds_list, function(LANE){
    # Add to counter_B
    counter_B <<- counter_B + 1
    # Add rectangles
    karyoploteR::kpRect(kp,
                        chr = LANE$chr,
                        x0 = LANE$END_PREV,
                        x1 = LANE$BIN_END,
                        y0 = lane_cutoffs[counter_B, ] %>% 
                          dplyr::pull(lower),
                        y1 = lane_cutoffs[counter_B, ] %>% 
                          dplyr::pull(upper),
                        col = LANE$COLOUR,
                        border = NA)
    # Add axis label
    karyoploteR::kpAddLabels(kp, labels = unique(LANE$LANE),
                             r0 = lane_cutoffs[counter_B, ] %>% 
                               dplyr::pull(lower),
                             r1 = lane_cutoffs[counter_B, ] %>% 
                               dplyr::pull(upper),
                             cex = 0.5)
  })
  
  
  dev.off()  
})
knitr::include_graphics(here::here("book/plots/20211110_ck_karyoplot_5000.png"))
Bin length: 5 kb

(#fig:unnamed-chunk-17)Bin length: 5 kb

3.4.4 Without filling in empty blocks

counter = 0
bb_list_ck_wunc = purrr::map(ck_list, function(df){
  counter <<- counter + 1
  
  BIN_LENGTH = names(ck_list)[counter] %>% 
    as.numeric()
  # loop over different bin lengths
  block_bounds_list = df %>% 
    # loop over LANE
    split(., f = .$LANE) %>% 
    purrr::map(., function(LANE){
    
      STRAIN = unique(LANE$LANE)
      # Create list of possible bins
      poss_bins = purrr::map(med_chr_lens$chr, function(CHR){
        # Get chr end
        CHR_END = med_chr_lens %>% 
          dplyr::filter(chr == CHR) %>% 
          dplyr::pull(end) %>% 
          as.numeric()
        # Get bin starts
        out = tibble::tibble(chr = as.numeric(CHR),
                             BIN_START = seq(from = 1, to = CHR_END, by = BIN_LENGTH),
                             BIN_END = BIN_START + BIN_LENGTH - 1
        )
        # Adjust final bin end 
        out[nrow(out), "BIN_END"] = CHR_END
        
        return(out)
      }) %>% 
        dplyr::bind_rows()
    
      
      # Bind DF
      new_df = dplyr::left_join(poss_bins,
                                LANE %>% 
                                  dplyr::select(chr, BIN_START, BIN_END, state),
                                by = c("chr", "BIN_START", "BIN_END")) %>% 
        # replace NAs with `UNCLASSIFIED`
        dplyr::mutate(state = state %>% 
                        tidyr::replace_na("UNCLASSIFIED"),
                      # add STRAIN
                      LANE = STRAIN) %>% 
        # add COLOUR
        dplyr::mutate(COLOUR = dplyr::recode(state,
                                             !!!pal_hom_het_2))
    
              
    })
})

Plot Karyoplots

counter_A = 0
purrr::map(bb_list_ck_wunc, function(block_bounds_list){
  counter_A <<- counter_A + 1
  # set file name
  file_name = paste("20211110_ck_karyoplot_wimiss_", names(bb_list_ck_wunc)[counter_A], ".png", sep = "")
  file_out = here::here("book/plots", file_name)
  
  # Get lane cutoffs
  lane_cutoffs = lc_list_ck[[counter_A]]
  
  png(file=file_out,
      width=13000,
      height=3000,
      units = "px",
      res = 300)
  
  # Plot ideogram
  kp = karyoploteR::plotKaryotype(med_genome, plot.type = 5)
  # Add data background
  #karyoploteR::kpDataBackground(kp, r0=0, r1 = 1, color = "white")
  
  # Add rectangles in loop
  counter_B = 0
  purrr::map(block_bounds_list, function(LANE){
    # Add to counter_B
    counter_B <<- counter_B + 1
    # Add rectangles
    karyoploteR::kpRect(kp,
                        chr = LANE$chr,
                        x0 = LANE$BIN_START,
                        x1 = LANE$BIN_END,
                        y0 = lane_cutoffs[counter_B, ] %>% 
                          dplyr::pull(lower),
                        y1 = lane_cutoffs[counter_B, ] %>% 
                          dplyr::pull(upper),
                        col = LANE$COLOUR,
                        border = NA)
    # Add axis label
    karyoploteR::kpAddLabels(kp, labels = unique(LANE$LANE),
                             r0 = lane_cutoffs[counter_B, ] %>% 
                               dplyr::pull(lower),
                             r1 = lane_cutoffs[counter_B, ] %>% 
                               dplyr::pull(upper),
                             cex = 3.5)
  })
  
  
  dev.off()  
})
knitr::include_graphics(here::here("book/plots/20211110_ck_karyoplot_wimiss_5000.png"))
Bin length: 5 kb

(#fig:unnamed-chunk-20)Bin length: 5 kb

4 F2 recombination blocks (all homozygous-divergent sites)

Snakefile for aligning F2 samples: https://github.com/brettellebi/somites/blob/master/workflow/rules/04_F2_mapping.smk Snakefile for running HMM and generating figures: https://github.com/brettellebi/somites/blob/master/workflow/rules/05_F2_recomb_blocks.smk

library(here)
#> here() starts at /hps/software/users/birney/ian/repos/somites
site_filter = "all_sites"

4.1 Base coverage

4.1.1 Total

knitr::include_graphics(here::here("book/plots/snakemake", site_filter, "5000/base_cov_total.png"))

4.1.2 By chromosome

knitr::include_graphics(here::here("book/plots/snakemake", site_filter, "5000/base_cov_by_chrom.png"))

4.2 Proportion of sites

4.2.1 Total

knitr::include_graphics(here::here("book/plots/snakemake", site_filter, "5000/prop_sites_total.png"))

4.2.2 By chromosome

knitr::include_graphics(here::here("book/plots/snakemake", site_filter, "5000/prop_sites_by_chrom.png"))

4.3 Karyoplots

4.3.1 No missing blocks

knitr::include_graphics(here::here("book/plots/snakemake", site_filter, "5000/karyoplot_no_missing.png"))

4.3.2 With missing blocks

knitr::include_graphics(here::here("book/plots/snakemake", site_filter, "5000/karyoplot_with_missing.png"))

5 F2 recombination blocks (filtered sites)

Exclusions: * reads overlapping HdrR repeat regions * regions of persistent heterozygosity in the MIKK panel * filtered based on read count and proportion of Cab)

Snakefile for aligning F2 samples: https://github.com/brettellebi/somites/blob/master/workflow/rules/04_F2_mapping.smk

Snakefile for running HMM and generating figures: https://github.com/brettellebi/somites/blob/master/workflow/rules/05_F2_recomb_blocks.smk

library(here)
#> here() starts at /hps/software/users/birney/ian/repos/somites
site_filter = "no_repeat_reads_or_pers_hets_filtered_for_read_count_and_cab_prop"

5.1 Base coverage

5.1.1 Total

knitr::include_graphics(here::here("book/plots/snakemake", site_filter, "5000/base_cov_total.png"))

5.1.2 By chromosome

knitr::include_graphics(here::here("book/plots/snakemake", site_filter, "5000/base_cov_by_chrom.png"))

5.2 Proportion of sites

5.2.1 Total

knitr::include_graphics(here::here("book/plots/snakemake", site_filter, "5000/prop_sites_total.png"))

5.2.2 By chromosome

knitr::include_graphics(here::here("book/plots/snakemake", site_filter, "5000/prop_sites_by_chrom.png"))

5.3 Karyoplots

5.3.1 No missing blocks

knitr::include_graphics(here::here("book/plots/snakemake", site_filter, "5000/karyoplot_no_missing.png"))

5.3.2 With missing blocks

knitr::include_graphics(here::here("book/plots/snakemake", site_filter, "5000/karyoplot_with_missing.png"))

6 Association testing with simulated phenotypes

Code for generating simulated phenotypes is set out below.

The simulated phenotypes are then used in this Snakefile for testing GWLS code, specifically rule test_gwls: https://github.com/brettellebi/somites/blob/master/workflow/rules/07_assocation_testing.smk

library(here)
source(here::here("book/source/04-Association_testing.R"))

6.1 Read in F2 recombination blocks

6.1.1 Read data

in_dir = "/nfs/research/birney/users/ian/somites/recombination_blocks/20211027"

in_files = list.files(in_dir, pattern = "F2_", full.names = T)

# Read into list
data_list = purrr::map(in_files, function(FILE){
  out = readr::read_tsv(FILE,
                        col_types = "ciiidii")
})
# Set names as bin length
names(data_list) = basename(in_files) %>% 
  stringr::str_split("_", simplify = T) %>% 
  subset(select = 2) %>% 
  stringr::str_remove(".txt")
# Reorder
data_list = data_list[order(as.numeric(names(data_list)))]

counter = 0
df_list = purrr::map(data_list, function(data){
  counter <<- counter + 1
  # set bin length
  bin_length = as.numeric(names(data_list)[counter])
  # add bin start and end coordinates
  df = data %>% 
    dplyr::mutate(SAMPLE = basename(sample) %>% 
                    stringr::str_remove(".txt") %>% 
                    as.numeric(.),
                  BIN_START = (bin - 1) * bin_length + 1,
                  BIN_END = bin * bin_length) %>% 
    # recode state to make 0 == "Cab"
    dplyr::mutate(STATE = dplyr::recode(state,
                                        `0` = 2,
                                        `1` = 1,
                                        `2` = 0)) %>% 
    dplyr::select(SAMPLE, CHROM = chr, BIN = bin, BIN_START, BIN_END, STATE)
  
  return(df)
})

6.1.2 How many possible blocks have at least one call?

6.1.2.1 Count total existing bins

bin_lengths = as.integer(names(df_list))
names(bin_lengths) = bin_lengths

n_bins = purrr::map(bin_lengths, function(BIN_LENGTH){
  purrr::map_int(med_chr_lens$end, function(CHR_END){
    out = seq(from = 1, to = CHR_END, by = BIN_LENGTH)
    
    return(length(out))
  })
})

n_bins_total = purrr::map_int(n_bins, sum)

6.1.2.2 Proportion of total bins with calls

# Get total number of samples
n_samples = df_list$`5000`$SAMPLE %>%
  unique(.) %>% 
  length(.)

# Build DF of bins
n_bins_df = purrr::map_dfr(1:length(df_list), function(COUNTER){
  # Bin length
  bin_length = as.numeric(names(df_list)[COUNTER])
  # Number of total bins
  n_bins = n_bins_total[COUNTER]
  # Number of bins with calls
  n_bins_with_calls = df_list[[COUNTER]] %>% 
    dplyr::distinct(CHROM, BIN_START) %>% 
    nrow(.)
  # Number of bins with calls for each 
  n_bins_no_missing = df_list[[COUNTER]] %>% 
    dplyr::count(CHROM, BIN_START) %>%
    dplyr::filter(n == n_samples) %>%
    nrow(.)
  
  # Build final data frame
  out = tibble::tibble(BIN_LENGTH = bin_length,
                       N_BINS = n_bins,
                       N_BINS_WITH_CALLS = n_bins_with_calls,
                       N_BINS_NO_MISSING = n_bins_no_missing) %>% 
    dplyr::mutate(PROP_BINS_WITH_CALLS = N_BINS_WITH_CALLS / N_BINS,
                  PROP_BINS_NO_MISSING = N_BINS_NO_MISSING / N_BINS )
  
  return(out)
})

DT::datatable(n_bins_df)

6.1.3 Merge recombination blocks

6.1.3.1 List with final recoded genotypes (including NAs)

gt_list = purrr::map(df_list, function(BIN_LENGTH_DF){
  
  # Widen data frame
  gt_final = BIN_LENGTH_DF %>% 
    tidyr::pivot_wider(names_from = SAMPLE, values_from = STATE)
  
  # Pull out matrix of genotypes
  gt_mat = as.matrix(gt_final[, 5:ncol(gt_final)])
  
  # Get indexes of loci with > 1 genotype
  bins_to_keep = logical()
  
  for (ROW in 1:nrow(gt_mat)){
    # get unique values in each row
    out = unique(gt_mat[ROW, ])
    # remove NAs
    out = out[!is.na(out)]
    # if more than one value, return TRUE
    if (length(out) > 1) {
      bins_to_keep[ROW] = TRUE
    }
    # if just one value (i.e. if all samples are the same genotype at that locus), return false 
    else {
      bins_to_keep[ROW] = FALSE
    }
  }
  
  # filter gt_final
  gt_filt = gt_final %>% 
    dplyr::filter(bins_to_keep) %>% 
    # recode genotypes to -1, 0, 1
    dplyr::mutate(dplyr::across(-c("CHROM", "BIN", "BIN_START", "BIN_END"),
                                ~dplyr::recode(.x,
                                               `0` = -1,
                                               `1` = 0,
                                               `2` = 1))) %>% 
    # order
    dplyr::arrange(CHROM, BIN_START)  
  
  return(gt_filt)
}) 

# Show first 10 columns
gt_list$`20000`[, 1:10] %>% 
  head(.) %>% 
  DT::datatable(.) 

6.1.3.2 List with final recoded genotypes (complete cases only)

gt_nomiss_list = purrr::map(gt_list, function(BIN_LENGTH_DF){
  BIN_LENGTH_DF %>%
    dplyr::filter(complete.cases(.))
})

6.2 Simulate phenotype

6.2.1 Extract samples of genotypes

# Set directory
out_dir_test = file.path(lts_dir, "association_testing/20211027_test")

These must be written to a file because PhenotypeSimulator reads delimited genotypes from files.

# Get random 10 loci
set.seed(10)
n_loci = 10
# NOTE: PhenotypeSimulator::readStandardGenotypes states that the genotype file must
# delim: a [delimter]-delimited file of [(NrSNPs+1) x (NrSamples+1)] genotypes with the snpIDs in the first column and the sampleIDs in the first row and genotypes encoded as numbers between 0 and 2 representing the (posterior) mean genotype, or dosage of the minor allele.

sample_gts = purrr::map(seq_along(gt_nomiss_list), function(COUNTER){
  
  # Pull out random SNPs
  snp_sample = gt_nomiss_list[[COUNTER]] %>% 
    dplyr::slice_sample(n = n_loci) %>% 
    dplyr::arrange(CHROM, BIN_START) %>% 
    # create locus column
    dplyr::mutate(LOCUS = paste(CHROM, BIN_START, sep = ":")) %>% 
    # remove superfluous columns
    dplyr::select(-c(CHROM, BIN, BIN_START, BIN_END)) %>% 
    # recode back to 0,1,2
    dplyr::mutate(dplyr::across(-LOCUS,
                                ~dplyr::recode(.x,
                                               `-1` = 0,
                                               `0` = 1,
                                               `1` = 2))) %>% 
    # reorder columns
    dplyr::select(LOCUS, everything())
  
})
names(sample_gts) = names(gt_nomiss_list)

purrr::map(seq_along(sample_gts), function(COUNTER){
  # save to file
  bin_length = names(sample_gts)[COUNTER]
  out_file = file.path(out_dir_test, "target_loci", paste(bin_length, ".csv", sep = ""))
  readr::write_csv(sample_gts[[COUNTER]], out_file)
})

saveRDS(sample_gts, file.path(out_dir_test, "target_loci/sample_gts.rds"))

6.2.2 Simulate phenotype

sim_path = file.path(out_dir_test, "simulated_phenotypes/20211103_sim_phenos.rds")
set.seed(5671)
# N samples
N = n_samples
# N phenotypes
P = 1
# Proportion of total genetic variance
genVar = 0.5
# Proportion of genetic variance of genetic variant effects
h2s = 1
# Proportion of total noise variance
noiseVar = 0.5
# Proportion of noise variance of observational noise effects
phi = 1

sim_phenos = purrr::map(seq_along(sample_gts), function(COUNTER){
  # get sample file path
  bin_length = names(sample_gts)[COUNTER]
  gt_sample_file = file.path(out_dir_test, "target_loci", paste(bin_length, ".csv", sep = ""))
    
  sim_pheno = PhenotypeSimulator::runSimulation(N = N, P = P, 
                                                genVar = genVar, h2s = h2s, 
                                                noiseVar = noiseVar, phi = phi,
                                                cNrSNP = n_loci,
                                                genotypefile = gt_sample_file,
                                                format = "delim",
                                                genoDelimiter = ",")
  
  return(sim_pheno)
})
names(sim_phenos) = names(sample_gts)

saveRDS(sim_phenos, sim_path)

# Write as .xlsx to use in same Snakemake code as true GWLS
lapply(seq_along(sim_phenos), function(COUNTER){
  out = tibble::tibble(fish = colnames(sample_gts[[COUNTER]])[-1],
                       Y = sim_phenos[[COUNTER]]$phenoComponentsFinal$Y)
  # set path for output
  out_file = file.path(lts_dir, 
                       "association_testing/20211027_test/simulated_phenotypes",
                       paste(names(sim_phenos)[COUNTER], ".xlsx", sep = ""))
  # write to file
  openxlsx::write.xlsx(out, out_file, overwrite = T)
})

6.3 Reformat genotypes for GridLMM

6.3.1 Complete cases

final_nomiss = purrr::map(seq_along(gt_nomiss_list), function(COUNTER){
  out = list()
  
  # Genotypes
  out[["genotypes"]] = gt_nomiss_list[[COUNTER]] %>% 
    dplyr::select(-c(CHROM, BIN, BIN_START, BIN_END)) %>% 
    # convert to matrix
    as.matrix(.) %>% 
    # transpose to put samples as rows
    t(.) %>% 
    # convert to data frame
    as.data.frame(.)
  
  # Positions
  out[["positions"]] = gt_nomiss_list[[COUNTER]] %>% 
    dplyr::select(CHROM, BIN_START, BIN_END)
  
  # Phenotypes
  out[["phenotypes"]] = data.frame(fish = rownames(sim_phenos[[COUNTER]]$phenoComponentsFinal$Y),
                                   Y = sim_phenos[[COUNTER]]$phenoComponentsFinal$Y %>%
                                     as.numeric()
                                   )
    
  return(out)
})
names(final_nomiss) = names(gt_nomiss_list)

6.3.2 With NAs

final_wimiss = purrr::map(seq_along(gt_list), function(COUNTER){
  out = list()
  
  # Genotypes
  out[["genotypes"]] = gt_list[[COUNTER]] %>% 
    dplyr::select(-c(CHROM, BIN, BIN_START, BIN_END)) %>% 
    # convert to matrix
    as.matrix(.) %>% 
    # transpose to put samples as rows
    t(.) %>% 
    # convert to data frame
    as.data.frame(.)
  
  # Positions
  out[["positions"]] = gt_list[[COUNTER]] %>% 
    dplyr::select(CHROM, BIN_START, BIN_END)
  
  # Phenotypes
  out[["phenotypes"]] = data.frame(SAMPLE = rownames(sim_phenos[[COUNTER]]$phenoComponentsFinal$Y),
                                   Y = sim_phenos[[COUNTER]]$phenoComponentsFinal$Y %>%
                                     as.numeric()
                                   )
    
  return(out)
})
names(final_wimiss) = names(gt_list)

6.4 Test GridLMM

6.4.1 No missing

6.4.1.1 Run GWAS

test_out_nomiss = file.path(out_dir_test, "gwls_results", "gwas_results_nomiss.rds")
gwas_tests_nomiss = purrr::map(final_nomiss, function(BIN_LENGTH){
  run_gwas(d = BIN_LENGTH$genotypes,
           m = BIN_LENGTH$positions,
           p = BIN_LENGTH$phenotypes)
})

saveRDS(gwas_tests_nomiss, test_out_nomiss)

6.4.1.2 Plot

# Create custom Manhattan plot

test_man_nomiss = lapply(seq_along(gwas_tests_nomiss), function(COUNTER){
  # Get bin length
  BIN_LENGTH = names(gwas_tests_nomiss)[COUNTER] %>% 
    as.numeric(.)
  
  # Clean data frame
  test_results = gwas_tests_nomiss[[COUNTER]]$results %>% 
    dplyr::left_join(med_chr_lens, by = c("Chr" = "chr")) %>% 
    # add x-coord
    dplyr::mutate(X_COORD = pos + TOT) %>% 
    # change column names
    dplyr::rename(CHROM = Chr, BIN_START = pos) %>% 
    # add BIN_END
    dplyr::mutate(BIN_END = BIN_START + BIN_LENGTH - 1) %>% 
    # add locus
    dplyr::mutate(LOCUS = paste(CHROM, BIN_START, sep = ":")) %>%
    # target or not
    dplyr::mutate(TARGET = dplyr::if_else(LOCUS %in% sample_gts[[COUNTER]]$LOCUS,
                                          "yes",
                                          "no"),
                  TARGET = factor(TARGET, levels = c("yes", "no"))) %>% 
    # create vector of colours
    dplyr::mutate(COLOUR = dplyr::case_when(TARGET == "yes" ~ names(gwas_pal)[1],
                                            gtools::even(CHROM) ~ names(gwas_pal)[2],
                                            gtools::odd(CHROM) ~ names(gwas_pal)[3]),
                  # order so that `target` is plotted last, at the front
                  COLOUR = factor(COLOUR, levels = rev(names(gwas_pal))),
                  SHAPE = dplyr::if_else(TARGET == "yes",
                                         18,
                                         20),
                  SIZE = dplyr::if_else(TARGET == "yes",
                                         1,
                                         0.5),
                  ALPHA = dplyr::if_else(TARGET == "yes",
                                         1,
                                         0.5)
                  )
  
  # Plot
  p1 = test_results %>% 
    ggplot(aes(x = X_COORD,
               y = -log10(p_value_REML),
               colour = COLOUR,
               shape = SHAPE,
               size = SIZE,
               alpha = ALPHA,
               label = CHROM,
               label2 = BIN_START,
               label3 = BIN_END)) + 
    geom_point() +
    aes(group = rev(TARGET)) +
    scale_color_manual(values = gwas_pal) +
    scale_shape_identity() +
    scale_size_identity() +
    scale_alpha_identity() +
    scale_x_continuous(breaks = med_chr_lens$MID_TOT, 
                       labels = med_chr_lens$chr) +
    theme_bw() +
    theme(panel.grid.major.x = element_blank(),
          panel.grid.minor.x = element_blank()
    ) +
    guides(colour = "none") + 
    ggtitle(paste("Bin length:", BIN_LENGTH)) +
    xlab("Chromosome") +
    ylab("-log10(p-value)") + 
    geom_hline(yintercept = significance_line, colour = "#AAF683", linetype = "dashed") +
    geom_hline(yintercept = suggestive_line, colour = "#60D394", linetype = "dashed")
  
  out = ggplotly(p1, tooltip = c("CHROM", "BIN_START", "BIN_END"))
  
  return(out)
})

6.4.2 Include loci with missing genotypes

6.4.2.1 Run GWAS

test_out_wimiss = file.path(out_dir_test, "gwls_results", "gwas_results_wimiss.rds")
gwas_tests_wimiss = purrr::map(final_wimiss, function(BIN_LENGTH){
  run_gwas(d = BIN_LENGTH$genotypes,
           m = BIN_LENGTH$positions,
           p = BIN_LENGTH$phenotypes)
})

saveRDS(gwas_tests_wimiss, test_out_wimiss)

6.4.2.2 Read in results from Snakemake script

gwas_tests_wimiss = purrr::map(bin_lengths, function(BIN_LENGTH){
  readRDS(file.path(lts_dir, "association_testing/20211104_test/test_results", paste(BIN_LENGTH, ".rds", sep = "")))
})

6.4.2.3 Plot

# Create custom Manhattan plot
gwas_pal = c("#2B2D42", "#F7B267", "#F25C54")
names(gwas_pal) = c("target", "even chr", "odd chr")
significance_line = 3.6
suggestive_line = 2.9

test_man_wimiss = purrr::map(seq_along(gwas_tests_wimiss), function(COUNTER){
  # Get bin length
  BIN_LENGTH = names(gwas_tests_wimiss)[COUNTER] %>% 
    as.numeric(.)
  
  # Clean data frame
  test_results = gwas_tests_wimiss[[COUNTER]]$results %>% 
    dplyr::left_join(med_chr_lens, by = c("Chr" = "chr")) %>% 
    # add x-coord
    dplyr::mutate(X_COORD = pos + TOT) %>% 
    # change column names
    dplyr::rename(CHROM = Chr, BIN_START = pos) %>% 
    # add BIN_END
    dplyr::mutate(BIN_END = BIN_START + BIN_LENGTH - 1) %>% 
    # add locus
    dplyr::mutate(LOCUS = paste(CHROM, BIN_START, sep = ":")) %>%
    # target or not
    dplyr::mutate(TARGET = dplyr::if_else(LOCUS %in% sample_gts[[COUNTER]]$LOCUS,
                                          "yes",
                                          "no"),
                  TARGET = factor(TARGET, levels = c("yes", "no"))) %>% 
    # create vector of colours
    dplyr::mutate(COLOUR = dplyr::case_when(TARGET == "yes" ~ names(gwas_pal)[1],
                                            gtools::even(CHROM) ~ names(gwas_pal)[2],
                                            gtools::odd(CHROM) ~ names(gwas_pal)[3]),
                  # order so that `target` is plotted last, at the front
                  COLOUR = factor(COLOUR, levels = rev(names(gwas_pal))),
                  SHAPE = dplyr::if_else(TARGET == "yes",
                                         18,
                                         20),
                  SIZE = dplyr::if_else(TARGET == "yes",
                                         1,
                                         0.5),
                  ALPHA = dplyr::if_else(TARGET == "yes",
                                         1,
                                         0.5)
                  )
  
  # Plot
  p1 = test_results %>% 
    ggplot(aes(x = X_COORD,
               y = -log10(p_value_REML),
               colour = COLOUR,
               shape = SHAPE,
               size = SIZE,
               alpha = ALPHA,
               label = CHROM,
               label2 = BIN_START,
               label3 = BIN_END)) + 
    geom_point() +
    aes(group = rev(TARGET)) +
    scale_color_manual(values = gwas_pal) +
    scale_shape_identity() +
    scale_size_identity() +
    scale_alpha_identity() +
    scale_x_continuous(breaks = med_chr_lens$MID_TOT, 
                       labels = med_chr_lens$chr) +
    theme_bw() +
    theme(panel.grid.major.x = element_blank(),
          panel.grid.minor.x = element_blank()
    ) +
    guides(colour = "none") + 
    ggtitle(paste("Bin length:", BIN_LENGTH)) +
    xlab("Chromosome") +
    ylab("-log10(p-value)") + 
    geom_hline(yintercept = significance_line, colour = "#AAF683", linetype = "dashed") +
    geom_hline(yintercept = suggestive_line, colour = "#60D394", linetype = "dashed")
  
  out = ggplotly(p1, tooltip = c("CHROM", "BIN_START", "BIN_END"))
  
  return(out)

})

test_man_wimiss[[1]]
test_man_wimiss[[2]]
test_man_wimiss[[3]]
test_man_wimiss[[4]]

6.5 Get significance levels from permutations

perm_file = here::here("data/20211109_permutation_mins.csv")
PERMUTATIONS_DIR = file.path(lts_dir, "association_testing/20211109_permutations")
SITE_FILTERS = c("all_sites", "no_repeat_sites", "no_repeat_reads"); names(SITE_FILTERS) = SITE_FILTERS
TARGET_PHENOS = c("mean", "intercept"); names(TARGET_PHENOS) = TARGET_PHENOS
BIN_LENGTHS = c(5000, 10000, 15000, 20000); names(BIN_LENGTHS) = BIN_LENGTHS
N_PERMUTATIONS = 1:10

perms_df = purrr::map_dfr(SITE_FILTERS, function(SITE_FILTER){
  purrr::map_dfr(TARGET_PHENOS, function(TARGET_PHENO){
    purrr::map_dfr(BIN_LENGTHS, function(BIN_LENGTH){
      purrr::map_dfr(N_PERMUTATIONS, function(N_PERM){
        in_list = readRDS(file.path(PERMUTATIONS_DIR, SITE_FILTER, TARGET_PHENO, BIN_LENGTH, paste(N_PERM, ".rds", sep = "")))
        # Pull out minimum
        out_df = tibble::tibble(MIN_P = in_list$results$p_value_REML %>%
                                  min()
        )
      }, .id = "PERMUTATION")
    }, .id = "BIN_LENGTH")
  }, .id = "TARGET_PHENO")
}, .id = "SITE_FILTER")

# Get minimum
perms_df_mins = perms_df %>% 
  dplyr::group_by(SITE_FILTER, TARGET_PHENO, BIN_LENGTH) %>% 
  dplyr::summarise(MIN_P = min(MIN_P))

readr::write_csv(perms_df_mins,
                 perm_file)
perms_df_mins %>% 
  DT::datatable()

7 GWLS results

library(here)
source(here::here("book/source/04-Association_testing.R"))

7.1 Notes

  • 20211104 association test performed on full sites file. Results here: /nfs/research/birney/users/ian/somites/association_testing/20211104_true/results
  • 20211109 association test performed on sites excluding those overlapping repeat regions. Results here: /nfs/research/birney/users/ian/somites/association_testing/20211109_true/results

7.2 Phenotype data

First 400 phenotype data: https://github.com/brettellebi/somites/tree/master/data/20210917_First400_F2_DF.xlsx.

I also attach here the DataFrame of the phenotyping of the first 400 F2s (First400_F2_DF.xlsx)…

Of interest for us for the association testing are the: intercept period -> intercept in the table mean period -> mean in the table

7.3 Snakemake rules

Snakemake rules for running the GWAS over these phenotypes: https://github.com/brettellebi/somites/blob/master/workflow/rules/07_assocation_testing.smk

7.4 Results

7.4.1 All sites

7.4.1.1 Read in files

target_phenos = c("mean", "intercept")
names(target_phenos) = target_phenos
bin_lengths = c(5000, 10000, 15000, 20000)
names(bin_lengths) = bin_lengths
results_dir = "/nfs/research/birney/users/ian/somites/association_testing/20211109_true/all_sites/results/"

gwas_true = purrr::map(target_phenos, function(PHENO){
  purrr::map(bin_lengths, function(BIN_LENGTH){
    readRDS(file.path(results_dir, PHENO, paste(BIN_LENGTH, ".rds", sep = "")))
  })
})

7.4.1.2 Manhattans

Process data

plot_dat = purrr::map(seq_along(gwas_true), function(COUNTER_1){
  bin_list = purrr::map(seq_along(gwas_true[[COUNTER_1]]), function(COUNTER_2){
    BIN_LENGTH = names(gwas_true[[COUNTER_1]])[COUNTER_2] %>% 
      as.numeric()
    out = clean_gwas_res(gwas_true[[COUNTER_1]][[COUNTER_2]],
                         bin_length = BIN_LENGTH,
                         chr_lens = med_chr_lens)
    
    return(out)
  })
  names(bin_list) = names(gwas_true[[COUNTER_1]])
  
  return(bin_list)
})
names(plot_dat) = names(gwas_true)

Read in significance levels from permutations

perm_file = here::here("data/20211109_permutation_mins.csv")
perms_df_mins = readr::read_csv(perm_file,
                                col_types = c("ccid"))

Plot

plot_dir = here::here("book/plots/20211109/gwls_results/all_sites")
# Plot
lapply(seq_along(plot_dat), function(COUNTER_PHENO){
  lapply(seq_along(plot_dat[[COUNTER_PHENO]]), function(COUNTER_BINL){
    # Get pheno
    PHENO = names(plot_dat)[COUNTER_PHENO]
    # Get bin length
    BINL = names(plot_dat[[COUNTER_PHENO]])[COUNTER_BINL] %>% 
      as.numeric()
    # Get significant level
    SIG_LEVEL = perms_df_mins %>%
      dplyr::filter(SITE_FILTER == "all_sites", TARGET_PHENO == PHENO, BIN_LENGTH == BINL) %>% 
      dplyr::pull(MIN_P) %>% 
      -log10(.)
    
    # Plot
    out = plot_man(plot_dat[[COUNTER_PHENO]][[COUNTER_BINL]],
             phenotype = PHENO,
             bin_length = BINL,
             gwas_pal = gwas_pal[2:3],
             med_chr_lens = med_chr_lens,
             sig_line = SIG_LEVEL) +
      ylim(0,7)
    
    ggsave(file.path(plot_dir, paste(PHENO, "_", BINL, ".png", sep = "")),
           out,
           device = "png",
           width = 9.6,
           height = 6,
           units = "in",
           dpi = 400)
    
    out
  })
})
#> [[1]]
#> [[1]][[1]]

#> 
#> [[1]][[2]]

#> 
#> [[1]][[3]]

#> 
#> [[1]][[4]]

#> 
#> 
#> [[2]]
#> [[2]][[1]]

#> 
#> [[2]][[2]]

#> 
#> [[2]][[3]]

#> 
#> [[2]][[4]]

7.4.2 Excluding reads overlapping repeats

7.4.2.1 Read in files

filter_type = "no_repeat_reads"
target_phenos = c("mean", "intercept")
names(target_phenos) = target_phenos
bin_lengths = c(5000, 10000, 15000, 20000)
names(bin_lengths) = bin_lengths
results_dir = file.path("/nfs/research/birney/users/ian/somites/association_testing/20211109_true", filter_type, "results")

gwas_true = purrr::map(target_phenos, function(PHENO){
  purrr::map(bin_lengths, function(BIN_LENGTH){
    readRDS(file.path(results_dir, PHENO, paste(BIN_LENGTH, ".rds", sep = "")))
  })
})

7.4.2.2 Manhattans

Process data

plot_dat = purrr::map(seq_along(gwas_true), function(COUNTER_1){
  bin_list = purrr::map(seq_along(gwas_true[[COUNTER_1]]), function(COUNTER_2){
    BIN_LENGTH = names(gwas_true[[COUNTER_1]])[COUNTER_2] %>% 
      as.numeric()
    out = clean_gwas_res(gwas_true[[COUNTER_1]][[COUNTER_2]],
                         bin_length = BIN_LENGTH,
                         chr_lens = med_chr_lens)
    
    return(out)
  })
  names(bin_list) = names(gwas_true[[COUNTER_1]])
  
  return(bin_list)
})
names(plot_dat) = names(gwas_true)

Read in significance levels from permutations

perm_file = here::here("data/20211109_permutation_mins.csv")
perms_df_mins = readr::read_csv(perm_file,
                                col_types = c("ccid"))

Plot

plot_dir = here::here(file.path("book/plots/20211109/gwls_results", filter_type))
# Plot
lapply(seq_along(plot_dat), function(COUNTER_PHENO){
  lapply(seq_along(plot_dat[[COUNTER_PHENO]]), function(COUNTER_BINL){
    # Get pheno
    PHENO = names(plot_dat)[COUNTER_PHENO]
    # Get bin length
    BINL = names(plot_dat[[COUNTER_PHENO]])[COUNTER_BINL] %>% 
      as.numeric()
    # Get significant level
    SIG_LEVEL = perms_df_mins %>%
      dplyr::filter(SITE_FILTER == filter_type, TARGET_PHENO == PHENO, BIN_LENGTH == BINL) %>% 
      dplyr::pull(MIN_P) %>% 
      -log10(.)
    
    # Plot
    out = plot_man(plot_dat[[COUNTER_PHENO]][[COUNTER_BINL]],
             phenotype = PHENO,
             bin_length = BINL,
             gwas_pal = gwas_pal[2:3],
             med_chr_lens = med_chr_lens,
             sig_line = SIG_LEVEL) +
      ylim(0,7)
    
    ggsave(file.path(plot_dir, paste(PHENO, "_", BINL, ".png", sep = "")),
           out,
           device = "png",
           width = 9.6,
           height = 6,
           units = "in",
           dpi = 400)
    
    out
  })
})
#> [[1]]
#> [[1]][[1]]

#> 
#> [[1]][[2]]

#> 
#> [[1]][[3]]

#> 
#> [[1]][[4]]

#> 
#> 
#> [[2]]
#> [[2]][[1]]

#> 
#> [[2]][[2]]

#> 
#> [[2]][[3]]

#> 
#> [[2]][[4]]

8 Notes

8.1 2 December 2021

Ali has sent the third batch of F2 data.

Sample 323 is missing the first pair.

8.2 26 November 2021

Suggestions from EB:

  • Set up guardrails – e.g. the site must have at least 0.2 to 0.8 iCab vs Total
  • Limit to >= 100 and <= 10000
  • Plot chromosomes before filtering

8.3 17 November 2021

Other HMM genotyping by sequencing (GBS) methods:

8.4 16 November 2021

2 (TIGER)

  • Trained Individual GenomE Reconstruction > The number of crossovers is typically limited to only one or two per chromosome pair per meiosis.

3

  • Software / pipeline: https://github.com/CarlborgGenomics/Stripes/tree/master/scripts

  • Used 30x sequence data from founders, and <0.5x sequence data on intercross individuals to infer the founder mosaic genotypes of intercross individuals.

  • 7.6M markers segregated, and 10-13.7% of these fixed for alternative alleles in the founders.

  • 95% agreement between genotypes from low-coverage data, and those obtained through SNP genotyping.

  • Estimated resolution of the inferred recombination breakpoints was relatively high, with 50% of them defined on regions shorter than 10 kb.

  • Implements TIGER (referenced above)

  • Experimenting with calling F0 and F2 together, to get:

  • Differences in numbers of sites called:

    • Original F0 only: 22618365 variants (18758908 SNPs)
    • F0 with F2:
  • bcftools’s roh results:

8.5 23 September 2021

The following samples failed during the alignment of the second batch of F2 individuals: config/20210923_f2_fails.txt. See that file for error types.

8.6 22 September 2021

  • Ali has finished uploading the second batch of F2 sequences.
  • Tom has given me the GridLMM code, which is saved to code/association_testing/gridLMM_gwas.R

8.7 31 August 2021

8.7.1 Location on EMBL server:

We use AnyConnect

vpn-gw1.embl.de

Server:

smb://aulehla

Username:

seleit

Location smb://aulehla/aulehla/NGS_Data_Library/2021-08-12-AAAG3YWHV

8.7.2 ascli

They can use ascli from the terminal. All the info is at https://intranet.embl.de/it_services/services/data_sharing/aspera/index.html.

ascli docs: https://www.rubydoc.info/gems/aspera-cli

Examples from docs:

ascli faspex health
ascli faspex package list
ascli faspex package list --box=sent --fields=package_id --format=csv --display=data|tail -n 1);\
ascli faspex package list --fields=package_id --format=csv --display=data|tail -n 1);\
ascli faspex package recv --to-folder=. --box=sent --id="my_package_id"
ascli faspex package recv --to-folder=. --id="my_package_id"
ascli faspex package recv --to-folder=. --id=ALL --once-only=yes
ascli faspex package recv --to-folder=. --link="my_faspex_publink_recv_from_fxuser"
ascli faspex package send --delivery-info=@json:'{"title":"Important files delivery","recipients":["internal.user@example.com","FASPEX_USERNAME"]}' testfile.bin
ascli faspex package send --link="my_faspex_publink_send_to_dropbox" --delivery-info=@json:'{"title":"Important files delivery"}' testfile.bin
ascli faspex package send --link="my_faspex_publink_send_to_fxuser" --delivery-info=@json:'{"title":"Important files delivery"}' testfile.bin
ascli faspex source name "Server Files" node br /
ascli faspex5 node list --value=@json:'{"type":"received","subtype":"mypackages"}'
ascli faspex5 package list --value=@json:'{"mailbox":"inbox","state":["released"]}'
ascli faspex5 package receive --id="my_package_id" --to-folder=.
ascli faspex5 package send --value=@json:'{"title":"test title","recipients":[{"name":"${f5_user}"}]}' testfile.bin

Test:

# On EBI cluster
cd /nfs/ftp/private/indigene_ftp/upload/Ali/Kaga-Cab_F2_Fish201-400_WGS/

## Example
ascli faspex package recv --to-folder=. --link="my_faspex_publink_recv_from_fxuser"

## Tests
ascli faspex package recv --to-folder=. --link="https://faspex.embl.de/aspera/faspex/external_deliveries/4366?passcode=221f3482a4d9b2fd87da2d21a2916156c2c1870d&expiration=MjAyMS0wOS0wN1QxMjo0Mjo0NVo="
#W, [2021-08-31T13:46:21.054223 #411589]  WARN -- : A new version is available: 4.2.0. You #have 4.1.0. Upgrade with: gem update 
#ascp: Error creating illegal char conversion table EINVAL, exiting.
#E, [2021-08-31T13:46:27.329374 #411589] ERROR -- : code: 11
#W, [2021-08-31T13:46:27.329485 #411589]  WARN -- : An error occured: ascp failed with code #1
#W, [2021-08-31T13:46:27.329507 #411589]  WARN -- : resuming in  2 seconds (retry left:6)
#ascp: Error creating illegal char conversion table EINVAL, exiting.
#E, [2021-08-31T13:46:30.441640 #411589] ERROR -- : code: 11
#W, [2021-08-31T13:46:30.441728 #411589]  WARN -- : An error occured: ascp failed with code #1
#E, [2021-08-31T13:46:30.441748 #411589] ERROR -- : non-retryable error
#ERROR: FASP(ascp]: ascp failed with code 1

ascli faspex package recv --to-folder=. --link="https://faspex.embl.de/aspera/faspex/external_deliveries/4366?passcode=221f3482a4d9b2fd87da2d21a2916156c2c1870d&expiration=MjAyMS0wOS0wN1QxMjo0Mjo0NVo=#"
# As above

ascli faspex package recv --to-folder=. --link="https://faspex.embl.de/aspera/faspex/external_deliveries/4366?passcode=221f3482a4d9b2fd87da2d21a2916156c2c1870d&expiration=MjAyMS0wOS0wN1QxMjo0Mjo0NVo"
# As above

ascli faspex package recv --to-folder=. --link=https://faspex.embl.de/aspera/faspex/external_deliveries/4366?passcode=221f3482a4d9b2fd87da2d21a2916156c2c1870d&expiration=MjAyMS0wOS0wN1QxMjo0Mjo0NVo=#
# As above

ascli faspex package recv --to-folder=. --delivery-info="https://faspex.embl.de/aspera/faspex/external_deliveries/4366?passcode=221f3482a4d9b2fd87da2d21a2916156c2c1870d&expiration=MjAyMS0wOS0wN1QxMjo0Mjo0NVo="
# ERROR: Argument: Missing mandatory option: id

ascli faspex package recv --id=brettell@ebi.ac.uk --to-folder=. --delivery-info="https://faspex.embl.de/aspera/faspex/external_deliveries/4366?passcode=221f3482a4d9b2fd87da2d21a2916156c2c1870d&expiration=MjAyMS0wOS0wN1QxMjo0Mjo0NVo="
# ERROR: Argument: Missing mandatory option: url

ascli faspex package recv --id=brettell@ebi.ac.uk --to-folder=. --url="https://faspex.embl.de/aspera/faspex/external_deliveries/4366?passcode=221f3482a4d9b2fd87da2d21a2916156c2c1870d&expiration=MjAyMS0wOS0wN1QxMjo0Mjo0NVo="
# ERROR: Argument: Missing mandatory option: username

ascli faspex package recv --id=brettell@ebi.ac.uk --username=brettell@ebi.ac.uk --to-folder=. --url="https://faspex.embl.de/aspera/faspex/external_deliveries/4366?passcode=221f3482a4d9b2fd87da2d21a2916156c2c1870d&expiration=MjAyMS0wOS0wN1QxMjo0Mjo0NVo="
# ERROR: Argument: Missing mandatory option: password

ascli faspex package recv --to-folder=. --transfer=httpgw --transfer-info=@json:'{"url":"https://faspex.embl.de/aspera/faspex/external_deliveries/4366?passcode=221f3482a4d9b2fd87da2d21a2916156c2c1870d&expiration=MjAyMS0wOS0wN1QxMjo0Mjo0NVo=#"}'

8.7.3 lftp

Hi, You can use the lftp command present in datatransfer.embl.de (here is a tutorial) https://linuxconfig.org/lftp-tutorial-on-linux-with-examples . If you want to send files that are in your group share, they will be available at /g/ location. Cheers.


Josep Manel Andrés Moscardó Systems Engineer, IT Operations EMBL Heidelberg T +49 6221 387-8394

8.8 3 August 2021

Added Cab and Kaga karyoplots.

Removed use of (almost) all wrappers in Snakemake pipeline. Now all rules are containerised other than for pulling the reference sequence from Ensembl.

NOTE: Install plotly and DT in R container.

8.9 15 July 2021

Notes for Ali:

  • Tried sample 171 again (the one you uploaded again), and it failed again at the alignment stage with the following error:
[mem_sam_pe_batch_post] paired reads have different names: "�0�����O&�����x�7�Z�HĻ�@�%Q�ü��", "ST-K00119:220:HKNVLBBXY:7:1101:1661:1138"
  • Fixed the sample names so that they don’t include the pool number at the beginning, e.g. sample 171 is no longer called 7171

  • Cab has 15M homozygous sites, whereas Kaga only has 5M. Reason: Cab aligns better to HdrR, so there are many more 0/0 calls?

  • 3.48M homozygous sites after joining Cab and Kaga homozygous site DFs.

  • 2.7M homozygous divergent sites.

8.10 13 July 2021

8.10.1 Attendees:

  • Ewan Birney
  • Alexander Aulehla
  • Tomas Fitzgerald
  • Ali Seleit
  • Ian Brettell

8.10.2 Notes:

  • Genetics:

    • Increase block size?

      • Try site-by-site
      • HMM would need to work on a count system so that it doesn’t get skewed
  • ACTION (Ian):

    • Check colours:

      • Would expect Cab skew, especially as it has the reporter.
  • ACTION (Ian):

    • Find number of reads covering each block for each individual.

    • If they reach < 5, increase the block size.

    • Also check out chr 16 reporter locus.

    • Make histogram of counts per block for each individual.

    • Data frame:

      • Fish ID
      • Block ID
      • Allele
      • Count
    • Plot a boxplot of counts per allele for all blocks per individual.

    • Do for 5, 10 and 15 kb windows, then make boxplots to compare.

    • Expect coverage across the homo-divergent regions is quite uneven.

    • Also estimate heritability?

  • ACTION (Ali):

    • Eyeball high phenotype points

    • Also look at the plate position.

    • Is there a between-day effect? (Note: Can only image 10 individuals per day)

    • Run a Kruskal-Wallis test for:

      • day
      • microscope (if they differ)
      • any other way you can split it up
      • look at the within-individual variance too
  • Re: sample size:

    • Pause imaging while we check the above.
    • Want to see the parents tight, and the F2 stretching out.

References

Mölder, Felix, Kim Philipp Jablonski, Brice Letcher, Michael B. Hall, Christopher H. Tomkins-Tinch, Vanessa Sochat, Jan Forster, et al. “Sustainable Data Analysis with Snakemake.” F1000Research 10 (April 19, 2021): 33. https://doi.org/10.12688/f1000research.29032.2.

  1. Felix Mölder et al., “Sustainable Data Analysis with Snakemake,” F1000Research 10 (April 19, 2021): 33, https://doi.org/10.12688/f1000research.29032.2.↩︎

  2. rowan2015?↩︎

  3. zan2019?↩︎